Preprocessing Dataset
Libraries
Library for read dataset.
library(readr)Library for data frames processing.
library(dplyr)
library(tidyr)Library for R Markdown.
library(rmarkdown)
library(knitr)Library for data presentation.
library(scales)Library for manage strings.
library(stringr)Load dataset
Load dataset.
df <- read_csv("../virusTotal/data/virusTotal.csv")Statistics
Dimensions.
dim(df)## [1] 183 447
Types
View witch types are in the dataset.
col_types_all <-
df %>%
sapply(typeof) %>%
unlist()
col_types_table <-
col_types_all %>%
table()
col_types <-
col_types_table %>%
as.vector()
names(col_types) <- names(col_types_table)## character double logical
## 158 204 85
As can be seen there are the three expected types: character, double and logical.
NA
Percentaje of NA values
Define function to see the amount of NA values in the dataframe.
percent_of_NA <-
function(df){
num_of_NA <-
df %>% is.na() %>% sum()
num_of_values <-
df %>% dim() %>% prod()
percent_of_NA <-
(num_of_NA / num_of_values) %>%
percent()
return(percent_of_NA)
}percent_of_NA(df)## [1] "37%"
Columns with NA
Define functions to see the NA in columns.
num_of_NA_by_column <-
function(df){
df %>% is.na() %>% colSums()
}remove_0 <-
function(x) x[x!=0]names_of_colums_with_NA <-
function(df)
df %>%
num_of_NA_by_column() %>%
remove_0 %>%
names()percentaje_of_cols_with_NA <-
function(df)
(length(names_of_colums_with_NA(df)) / ncol(df)) %>%
percent()Compute the percentaje of cols with NA.
percentaje_of_cols_with_NA(df)## [1] "74%"
Inspect if there are columns full of NA.
is_full_of_NA <- function(col){
num_of_NA <-
col %>%
is.na() %>%
sum()
return(num_of_NA == length(col))
}cols_full_of_NA <-
df %>%
select_if(is_full_of_NA) %>%
names()## [1] "authentihash" "scans.Bkav.result"
## [3] "scans.CMC.result" "scans.ALYac.result"
## [5] "scans.Malwarebytes.result" "scans.K7AntiVirus.result"
## [7] "scans.Baidu.result" "scans.SUPERAntiSpyware.result"
## [9] "scans.Gridinsoft.result" "scans.ViRobot.result"
## [11] "scans.BitDefenderTheta.result" "scans.TACHYON.result"
## [13] "scans.VBA32.result" "scans.Zoner.result"
## [15] "scans.Panda.result" "scans.Elastic.result"
## [17] "scans.Cylance.result" "scans.SentinelOne.result"
As can be seen there are many columns that are full of NA, so can be deleted.
df <-
df %>%
select(-all_of(cols_full_of_NA))Colums with the same value
Maybe there are columns that has the same value along all the vector, so are useless.
Define function to remove these columns.
remove_columns_with_the_same_value <-
function(df)
select_if(df, function(col) length(unique(col)) > 1)Apply function.
num_of_cols_after_remove <-
df %>%
remove_columns_with_the_same_value() %>%
ncol()Calculate the number of columns with same value.
ncol(df) - num_of_cols_after_remove## [1] 39
Awesom! Many colums found. Let’s remove them.
df <-
remove_columns_with_the_same_value(df)Inspecting dataframe
Now let’s deeply inspect into the dataframe.
Replace NA for 0 into permissions colums
The permissions columns (PERM) seems that there are NAs where there should be 0s. So it would be better to replace them.
replace_na_which_colname_match <-
function(df, pattern, replacement){
cols_to_replace <-
df %>%
colnames() %>%
str_which(pattern)
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}replace_na_when <-
function(df, fun, replacement){
cols_to_replace <-
df %>%
select_if(fun) %>%
colnames()
df_replaced_cols <-
df %>%
select(all_of(cols_to_replace)) %>%
sapply(function(col) replace_na(col, replacement))
df_without_replaced_cols <-
df %>%
select(-all_of(cols_to_replace))
return(cbind(df_without_replaced_cols, df_replaced_cols))
}pattern <- "additional_info.androguard.RiskIndicator.PERM"
df <-
df %>%
replace_na_which_colname_match(pattern, 0)
df %>%
select(., str_which(colnames(.), pattern)) %>%
paged_table()df_permissions <-
df %>%
select(., str_which(colnames(.), "additional_info.androguard.RiskIndicator.PERM")) %>%
sapply(function(col) replace_na(col, 0))
df_without_permissions <-
df %>%
select(., -c(str_which(colnames(.), "additional_info.androguard.RiskIndicator.PERM")))
df <- cbind(df_permissions, df_without_permissions)As factor
labels <-
function(n){
if(n == 5){
return(c("very low", "low", "medium", "high", "very high"))
}else if(n == 4){
return(c("very low", "low", "high", "very high"))
}else if(n == 3){
return(c("low", "medium", "high"))
}else if(n == 2){
return(c("low", "high"))
}else{
stop("Not avalible")
}
}
cut_by_quantiles <-
function(col){
quantiles <-
col %>%
quantile(na.rm = TRUE) %>%
unique()
if(length(quantiles) > 2){
col <-
col %>%
cut(breaks = quantiles,
labels = labels(length(quantiles)-1),
include.lowest = TRUE)
}
return(col)
}
df_cut_by_quantiles <-
function(df){
df_without_numeric <-
df[sapply(df, function(col) !is.numeric(col))]
df_numeric <-
df %>%
select_if(is.numeric)
df_numeric <-
df_numeric %>%
lapply(cut_by_quantiles)
return(cbind(df_without_numeric, df_numeric))
}Col total permissions
pattern <- "additional_info.androguard.RiskIndicator.PERM"
df_without_permissions <-
df %>%
select(., -(str_which(colnames(.), pattern)))
df_permissions <-
df %>%
select(., str_which(colnames(.), pattern)) %>%
mutate(., total_PERMs = rowSums(.))
df <- cbind(df_without_permissions, df_permissions)